#import libraries
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
from sklearn.cluster import KMeans
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import xgboost as xgb
#initate plotly
pyoff.init_notebook_mode()
# Load the Online Retail dataset and redo the earlier data preparation.
tx_data = pd.read_excel('OnlineRetail.xlsx')
tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])

# Keep United Kingdom transactions only.
tx_uk = tx_data[tx_data['Country'] == 'United Kingdom'].reset_index(drop=True)

# 3-month window (Mar-May 2011) for building the feature set and a
# 6-month window (Jun-Nov 2011) for the LTV label.
in_3m = (tx_uk.InvoiceDate >= datetime(2011, 3, 1)) & (tx_uk.InvoiceDate < datetime(2011, 6, 1))
tx_3m = tx_uk[in_3m].reset_index(drop=True)
in_6m = (tx_uk.InvoiceDate >= datetime(2011, 6, 1)) & (tx_uk.InvoiceDate < datetime(2011, 12, 1))
tx_6m = tx_uk[in_6m].reset_index(drop=True)

# One row per customer seen in the 3-month window; clusters get attached here.
tx_user = pd.DataFrame({'CustomerID': tx_3m['CustomerID'].unique()})
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel KMeans cluster ids so they are ordered by the target's mean.

    KMeans assigns arbitrary cluster ids; this renumbers them so that id 0
    is the cluster whose mean `target_field_name` comes first in the chosen
    sort order (ascending=True: lowest mean -> 0).

    Args:
        cluster_field_name: column holding the raw cluster ids.
        target_field_name: numeric column whose per-cluster mean defines the order.
        df: dataframe containing both columns.
        ascending: sort direction for the per-cluster means.

    Returns:
        A new dataframe with `cluster_field_name` replaced by the ordered ids.
        Row order may change (inner merge on the cluster column).
    """
    # Removed unused local `new_cluster_field_name` from the original.
    cluster_means = (
        df.groupby(cluster_field_name)[target_field_name]
          .mean()
          .reset_index()
          .sort_values(by=target_field_name, ascending=ascending)
          .reset_index(drop=True)
    )
    # The sorted positional index becomes the new cluster id.
    cluster_means['index'] = cluster_means.index
    df_final = pd.merge(df, cluster_means[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={"index": cluster_field_name})
    return df_final
# Recency: days since each customer's last purchase in the 3m window,
# measured from the most recent purchase date observed in the window.
tx_max_purchase = tx_3m.groupby('CustomerID').InvoiceDate.max().reset_index()
tx_max_purchase.columns = ['CustomerID', 'MaxPurchaseDate']
tx_max_purchase['Recency'] = (
    tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']
).dt.days
tx_user = pd.merge(tx_user, tx_max_purchase[['CustomerID', 'Recency']], on='CustomerID')
# Fixed seed + explicit n_init make the clustering reproducible across runs
# (the original relied on KMeans' random initialization).
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10)
tx_user['RecencyCluster'] = kmeans.fit_predict(tx_user[['Recency']])
# Lower recency is better, so order clusters by descending recency
# (cluster 0 = least recent, highest id = most recent).
tx_user = order_cluster('RecencyCluster', 'Recency', tx_user, False)
# Frequency: number of purchase rows per customer in the 3m window.
# NOTE(review): this counts line items, not distinct invoices — counting
# tx_3m.InvoiceNo.nunique() may be intended; confirm before changing.
tx_frequency = tx_3m.groupby('CustomerID').InvoiceDate.count().reset_index()
tx_frequency.columns = ['CustomerID', 'Frequency']
tx_user = pd.merge(tx_user, tx_frequency, on='CustomerID')
# Fixed seed + explicit n_init for reproducible clustering.
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10)
tx_user['FrequencyCluster'] = kmeans.fit_predict(tx_user[['Frequency']])
# Higher frequency is better: ascending order so the highest id = most frequent.
tx_user = order_cluster('FrequencyCluster', 'Frequency', tx_user, True)
# Monetary: total revenue per customer over the 3m window.
tx_3m['Revenue'] = tx_3m['UnitPrice'] * tx_3m['Quantity']
tx_revenue = tx_3m.groupby('CustomerID').Revenue.sum().reset_index()
tx_user = pd.merge(tx_user, tx_revenue, on='CustomerID')
# Fixed seed + explicit n_init for reproducible clustering.
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10)
tx_user['RevenueCluster'] = kmeans.fit_predict(tx_user[['Revenue']])
# Higher revenue is better: ascending order so the highest id = top spenders.
tx_user = order_cluster('RevenueCluster', 'Revenue', tx_user, True)
# Overall RFM score = sum of the three ordered cluster ids (0..9 possible),
# then bucketed into three named segments.
tx_user['OverallScore'] = (
    tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']
)
score = tx_user['OverallScore']
# Conditions are checked in order: >4 wins over >2, everything else is Low.
tx_user['Segment'] = np.select(
    [score > 4, score > 2],
    ['High-Value', 'Mid-Value'],
    default='Low-Value',
)
tx_user.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 13313.0 | 50 | 1 | 34 | 0 | 631.60 | 0 | 1 | Low-Value |
| 1 | 16875.0 | 60 | 1 | 75 | 0 | 1326.13 | 0 | 1 | Low-Value |
| 2 | 16282.0 | 60 | 1 | 11 | 0 | 168.30 | 0 | 1 | Low-Value |
| 3 | 13963.0 | 59 | 1 | 22 | 0 | 397.56 | 0 | 1 | Low-Value |
| 4 | 17188.0 | 60 | 1 | 10 | 0 | 388.72 | 0 | 1 | Low-Value |
# 6m revenue per customer — this is the source of the LTV label.
tx_6m['Revenue'] = tx_6m['UnitPrice'] * tx_6m['Quantity']
tx_user_6m = tx_6m.groupby('CustomerID')['Revenue'].sum().reset_index()
tx_user_6m.columns = ['CustomerID', 'm6_Revenue']

# Plot the LTV distribution, truncated at 10k for readability.
ltv_under_10k = tx_user_6m.query('m6_Revenue < 10000')['m6_Revenue']
fig = go.Figure(
    data=[go.Histogram(x=ltv_under_10k)],
    layout=go.Layout(title='6m Revenue'),
)
pyoff.iplot(fig)
The histogram clearly shows that we have customers with negative LTV, as well as some outliers. Filtering out the outliers makes sense for building a proper machine learning model.
We will merge our 3 months and 6 months dataframes to see correlations between LTV and the feature set we have.
# Attach the 6m LTV label to the 3m feature set; customers absent from the
# 6m window get 0 revenue.
tx_merge = pd.merge(tx_user, tx_user_6m, on='CustomerID', how='left')
tx_merge = tx_merge.fillna(0)

# Truncate extreme LTV values for a readable scatter plot.
tx_graph = tx_merge.query("m6_Revenue < 30000")

# One scatter trace per segment: (segment value, legend label, size, color, opacity).
segment_styles = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]
plot_data = []
for segment, label, size, color, opacity in segment_styles:
    subset = tx_graph[tx_graph['Segment'] == segment]
    plot_data.append(
        go.Scatter(
            x=subset['OverallScore'],
            y=subset['m6_Revenue'],
            mode='markers',
            name=label,
            marker=dict(size=size, line=dict(width=1), color=color, opacity=opacity),
        )
    )

plot_layout = go.Layout(
    yaxis={'title': "6m LTV"},
    xaxis={'title': "RFM Score"},
    title='LTV',
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
We can see a positive relation here: a higher RFM score means a higher LTV.
Considering business part of this analysis, we need to treat customers differently based on their predicted LTV. For this example, we will apply clustering and have 3 segments (number of segments really depends on your business dynamics and goals):
# Remove the top 1% of 6m revenue as outliers before clustering the label.
tx_merge = tx_merge[tx_merge['m6_Revenue'] < tx_merge['m6_Revenue'].quantile(0.99)]
# Segment customers into 3 LTV groups; fixed seed + explicit n_init make the
# cluster assignment reproducible across runs (the original was unseeded).
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)
tx_merge['LTVCluster'] = kmeans.fit_predict(tx_merge[['m6_Revenue']])
# Relabel clusters so a higher id means a higher mean LTV.
tx_merge = order_cluster('LTVCluster', 'm6_Revenue', tx_merge, True)
# Work on a copy so tx_merge stays untouched by later feature engineering.
tx_cluster = tx_merge.copy()
# Inspect the per-cluster LTV distribution.
tx_cluster.groupby('LTVCluster')['m6_Revenue'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| LTVCluster | ||||||||
| 0 | 1515.0 | 439.667809 | 477.181202 | -823.50 | 0.00 | 300.55 | 741.475 | 1664.44 |
| 1 | 321.0 | 2911.984081 | 1146.387618 | 1667.12 | 2008.12 | 2555.46 | 3593.430 | 6410.56 |
| 2 | 37.0 | 10138.774054 | 2725.583432 | 6685.53 | 7761.38 | 9846.10 | 11095.560 | 15998.57 |
# Preview the frame before one-hot encoding. Typo fixed: "varibles" -> "variables".
print("Before dummy variables")
tx_cluster.head()
Before dummy varibles
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment | m6_Revenue | LTVCluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13313.0 | 50 | 1 | 34 | 0 | 631.60 | 0 | 1 | Low-Value | 923.72 | 0 |
| 1 | 16875.0 | 60 | 1 | 75 | 0 | 1326.13 | 0 | 1 | Low-Value | 266.00 | 0 |
| 2 | 16282.0 | 60 | 1 | 11 | 0 | 168.30 | 0 | 1 | Low-Value | 0.00 | 0 |
| 3 | 13963.0 | 59 | 1 | 22 | 0 | 397.56 | 0 | 1 | Low-Value | 0.00 | 0 |
| 4 | 17188.0 | 60 | 1 | 10 | 0 | 388.72 | 0 | 1 | Low-Value | 960.64 | 0 |
# One-hot encode categorical columns (only 'Segment' here) so the frame is
# fully numeric for XGBoost.
tx_class = pd.get_dummies(tx_cluster)
# Message casing normalized to sentence case for consistency.
print("After dummy variables")
tx_class.head()
After dummy Variables
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | m6_Revenue | LTVCluster | Segment_High-Value | Segment_Low-Value | Segment_Mid-Value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13313.0 | 50 | 1 | 34 | 0 | 631.60 | 0 | 1 | 923.72 | 0 | 0 | 1 | 0 |
| 1 | 16875.0 | 60 | 1 | 75 | 0 | 1326.13 | 0 | 1 | 266.00 | 0 | 0 | 1 | 0 |
| 2 | 16282.0 | 60 | 1 | 11 | 0 | 168.30 | 0 | 1 | 0.00 | 0 | 0 | 1 | 0 |
| 3 | 13963.0 | 59 | 1 | 22 | 0 | 397.56 | 0 | 1 | 0.00 | 0 | 0 | 1 | 0 |
| 4 | 17188.0 | 60 | 1 | 10 | 0 | 388.72 | 0 | 1 | 960.64 | 0 | 0 | 1 | 0 |
# Pearson correlation of every numeric column against the LTV cluster label.
# m6_Revenue dominates by construction — the clusters were derived from it —
# which is why it is dropped from the feature set below.
corr_matrix = tx_class.corr()
corr_matrix['LTVCluster'].sort_values(ascending=False)
LTVCluster 1.000000 m6_Revenue 0.850448 Revenue 0.407058 Frequency 0.332215 OverallScore 0.298391 RevenueCluster 0.281579 FrequencyCluster 0.281295 Segment_High-Value 0.246524 RecencyCluster 0.224109 Segment_Mid-Value 0.202589 CustomerID -0.033922 Recency -0.218041 Segment_Low-Value -0.246278 Name: LTVCluster, dtype: float64
#create X and y, X will be feature set and y is the label - LTV
# m6_Revenue is dropped along with the label: LTVCluster was derived directly
# from m6_Revenue, so keeping it would leak the target into the features.
X = tx_class.drop(['LTVCluster','m6_Revenue'],axis=1)
y = tx_class['LTVCluster']
#split training and test sets
# NOTE(review): test_size=0.05 leaves only ~94 test rows (see report below);
# a larger hold-out would give more stable per-class metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)
# XGBoost multiclass model predicting the LTV cluster (0 = low, 2 = high).
# eval_metric is set explicitly to silence the XGBoost 1.3+ default-change
# warning seen in the original run ('merror' -> 'mlogloss').
ltv_xgb_model = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.1,
    objective='multi:softprob',
    eval_metric='mlogloss',
    n_jobs=-1,
    use_label_encoder=False,
).fit(X_train, y_train)
print('Accuracy of XGB classifier on training set: {:.2f}'
      .format(ltv_xgb_model.score(X_train, y_train)))
# X_test already carries the training columns in order; the original's
# X_test[X_train.columns] reindex (which triggered a sliced-ndarray warning)
# is unnecessary.
print('Accuracy of XGB classifier on test set: {:.2f}'
      .format(ltv_xgb_model.score(X_test, y_test)))
y_pred = ltv_xgb_model.predict(X_test)
# Per-class precision/recall/f1 on the hold-out set.
print(classification_report(y_test, y_pred))
[17:45:53] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Accuracy of XGB classifier on training set: 0.93
Accuracy of XGB classifier on test set: 0.83
precision recall f1-score support
0 0.86 0.95 0.90 76
1 0.56 0.33 0.42 15
2 1.00 0.33 0.50 3
accuracy 0.83 94
macro avg 0.80 0.54 0.61 94
weighted avg 0.81 0.83 0.81 94
C:\Users\baps\anaconda3\lib\site-packages\xgboost\data.py:112: UserWarning: Use subset (sliced data) of np.ndarray is not recommended because it will generate extra copies and increase memory consumption
Precision and recall are acceptable for 0. As an example, for cluster 0 (Low LTV), if the model tells us a customer belongs to cluster 0, 86 out of 100 will be correct (precision 0.86). And the model successfully identifies 95% of actual cluster 0 customers (recall 0.95). We really need to improve the model for the other clusters. For example, we detect only 33% of Mid LTV customers (recall 0.33). Possible actions to improve those points: